import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as py
import seaborn as sns
import folium
from folium.plugins import TimeSliderChoropleth
from branca.element import Figure
from scipy import stats
import statsmodels.api as sm
import geopandas as gpd
sns.set()
import urllib
import os
# Download yellow-taxi trip CSVs (Dec/Jan/Feb) for the 2017 and 2018
# seasons from the NYC TLC public S3 bucket into the working directory.
# FIX: `import urllib` alone does not guarantee the `urllib.request`
# submodule is loaded; import it explicitly before use.
import urllib.request

months = ["12", "01", "02"]
years = [2017, 2018]
for year in years:
    for month in months:
        fname = os.getcwd() + f'/yellow_tripdata_{year}-{month}.csv'
        url = f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year}-{month}.csv'
        urllib.request.urlretrieve(url, fname)
# Download the remaining period (Dec 2016); the commented list shows the
# full set originally considered.
# FIX: explicitly import urllib.request (see note on the loop above).
import urllib.request
import os

#month_year = [[2016, "12"], [2019, "01"], [2019, "02"]]
month_year = [[2016, "12"]]
for date in month_year:
    fname = os.getcwd() + f'/yellow_tripdata_{date[0]}-{date[1]}.csv'
    url = f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{date[0]}-{date[1]}.csv'
    urllib.request.urlretrieve(url, fname)
# Repair the 2016-12 CSV: the raw file appears to have misaligned
# columns, so the frame is re-indexed, two columns dropped, and the
# column labels rotated two places to realign names with the data.
# NOTE(review): this relies on the exact misalignment observed in that
# one file -- confirm before reusing on other months.
taxi = pd.read_csv("yellow_tripdata_2016-12.csv")
taxi = taxi.reset_index()
taxi.drop(labels=[1, 2], axis=1, inplace = True)
taxi.columns = list(taxi.columns[2:])+[1, 2]
taxi.to_csv("yellow_tripdata_2016-12.csv", index=False)
# getting the snowfall data
snow_data = pd.read_csv('snow.csv')
snow_data.head()
snow_data.info()
# Replace the non-numeric flags 'T' (trace) and 'M' (missing) with the
# sentinel -100 so the columns can be cast to float.  # only for flag
snow_data.loc[snow_data['New Snow']=='T', 'New Snow'] = -100
snow_data.loc[snow_data['New Snow']=='M', 'New Snow'] = -100
snow_data.loc[snow_data['Snow Depth']=='T', 'Snow Depth'] = -100
snow_data.loc[snow_data['Snow Depth']=='M', 'Snow Depth'] = -100
lst = ['New Snow', 'Snow Depth']
for each_snow in lst:
    snow_data[each_snow] = snow_data[each_snow].apply(lambda x: float(x))
# Since this is a weather data, I made a decision to take an average of the past 2 days
# NOTE(review): if two flagged days are adjacent, the -100 sentinel from
# the earlier day leaks into this average -- confirm no adjacent flags.
for each_snow in lst:
    index_replaced = snow_data[snow_data[each_snow]==-100].index
    for each_index in index_replaced:
        snow_data.loc[each_index, each_snow] = np.mean(snow_data.loc[each_index-2:each_index-1, each_snow])
# round the imputed values to one decimal place
for each_snow in lst:
    snow_data[each_snow] = round(snow_data[each_snow], 1)
# convert the temperature columns from Fahrenheit to truncated Celsius
lst_temp = ['MaxTemp', 'MinTemp', 'Average']
for temp in lst_temp:
    snow_data[temp] = snow_data[temp].apply(lambda x: int((x-32)*5/9))
snow_data.to_csv('snow.csv')
# Done cleaning the snow data.
# Exploratory checks on the Dec-2018 file.
taxi = pd.read_csv('yellow_tripdata_2018-12.csv')
# Newark trips (RatecodeID 3) below the Newark minimum fare
newark = taxi[taxi['RatecodeID']==3]
newark[newark['fare_amount']<17.5]
type(taxi['tpep_pickup_datetime'][0])
taxi['tpep_dropoff_datetime'] = pd.to_datetime(taxi['tpep_dropoff_datetime'])
taxi['tpep_pickup_datetime'] = pd.to_datetime(taxi['tpep_pickup_datetime'])
# trip duration in hours; NOTE(review): .seconds wraps negative deltas
# into 0..86399 rather than staying negative
taxi['time_taken'] = taxi['tpep_dropoff_datetime']-taxi['tpep_pickup_datetime']
taxi['time_taken'] = taxi['time_taken'].apply(lambda x: x.seconds/3600)
taxi['time_taken'] = taxi['time_taken'].astype('float')
#taxi['time_taken'] = taxi['time_taken'].astype('float')
taxi['speed'] = taxi['trip_distance']/taxi['time_taken']
taxi['speed']
# example of dirty date
# NOTE(review): the pickup column was converted to datetime above, so
# .split() on a Timestamp raises -- these cells presumably ran in a
# different order in the original notebook.
taxi['Date'] = taxi['tpep_pickup_datetime'].apply(lambda x: x.split()[0])
taxi['Date'].unique()
taxi = taxi[taxi['speed']<55]
taxi = taxi[taxi['time_taken']>0]
taxi['trip_count'] = 1
taxi_201701 = taxi.groupby(by='Date').sum().reset_index()
taxi_201701_mean = taxi.groupby(by='Date').mean().reset_index()
taxi.isnull().sum()
taxi.head()
taxi[taxi['passenger_count']==0].shape
taxi['RatecodeID'].unique()
taxi['Date'] = taxi['tpep_pickup_datetime'].apply(lambda x: x.split()[0])
# NOTE(review): time_taken is already a float here; .seconds would raise
taxi['time_taken'] = taxi['time_taken'].apply(lambda x: x.seconds/3600)
taxi['payment_type']
def clean_taxi_frame(data, month, year):
    """Apply the first-layer cleaning rules to a raw trip DataFrame.

    Filters implausible records (no passengers, payment types other than
    credit/cash, unknown rate codes, non-positive distances, fares at or
    below the NYC minimum, dates outside the requested month) and derives
    `Date`, `time_taken` (hours) and `speed` (mph) columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw yellow-taxi trip records.
    month : str
        Zero-padded month, e.g. '12'.
    year : str
        Four-digit year, e.g. '2016'.

    Returns
    -------
    pandas.DataFrame
        The cleaned subset.
    """
    data = data[data['passenger_count'] > 0]
    # keep credit (1) and cash (2) only; tips are only recorded for credit
    data = data.loc[data['payment_type'].isin([1, 2]), :]
    data = data.loc[data['RatecodeID'].isin([1, 2, 3, 4, 5, 6]), :]
    # positive trip distance; fare above the NYC minimum of 2.5
    data = data[data['trip_distance'] > 0]
    data = data[data['fare_amount'] > 2.5]
    # valid day-of-month range for the requested month
    last_day = 31 if month in ('01', '12') else 28
    valid_days = set(range(1, last_day + 1))
    # extract the date string and keep only rows inside the month/year
    data['Date'] = data['tpep_pickup_datetime'].apply(lambda x: x.split()[0])
    lst_date = [d for d in data['Date'].unique()
                if d[5:7] == month and d[0:4] == year and int(d[8:]) in valid_days]
    data = data[data['Date'].isin(lst_date)]
    data = data.loc[:, ['tpep_pickup_datetime', 'tpep_dropoff_datetime',
                        'trip_distance', 'RatecodeID', 'PULocationID',
                        'DOLocationID', 'tip_amount', 'total_amount',
                        'fare_amount', 'extra', 'tolls_amount', 'Date',
                        'payment_type']]
    # derive duration (hours) and speed (mph)
    data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])
    data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
    # FIX: use .dt.total_seconds() rather than per-row .seconds --
    # .seconds wraps negative durations into 0..86399, letting trips with
    # dropoff before pickup slip past the > 0 filter below.
    data['time_taken'] = ((data['tpep_dropoff_datetime']
                           - data['tpep_pickup_datetime'])
                          .dt.total_seconds() / 3600).astype('float')
    data['speed'] = data['trip_distance'] / data['time_taken']
    data = data[data['time_taken'] > 0]
    data = data[data['speed'] > 0]
    return data


def preprocessing(month, year, filename):
    """First layer of preprocessing: read the CSV `filename`, clean it
    with clean_taxi_frame, and write the result next to it as .feather."""
    data = pd.read_csv(filename)
    data = clean_taxi_frame(data, month, year)
    data.reset_index().to_feather(filename[:-3] + 'feather')
    del data
# First layer of preprocessing: run the cleaning pass for each
# (month, year) period. The commented list is the full season set.
#dates = [['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
dates = [['12', '2016']]
for month, year in dates:
    print('Month:', month)
    print('Year:', year)
    preprocessing(month, year, f'yellow_tripdata_{year}-{month}.csv')
# Zone lookup table and a sanity check on the first-layer output.
zone = pd.read_csv('taxi+_zone_lookup.csv')
taxi = pd.read_feather('yellow_tripdata_2018-12.feather')
taxi.columns
zone[zone['LocationID']==52]
# JFK flat-rate trips (RatecodeID 2) should all have fare_amount == 52
jfk_check = taxi[taxi['RatecodeID']==2]
jfk_check[jfk_check['fare_amount']!=52]
# Attributes that are prone to logic errors in their data: speed, time taken, trip distance, fare amount, tolls and extra.
# Inspect the error-prone numeric attributes.
inv_data = taxi[['speed', 'time_taken', 'trip_distance', 'fare_amount', 'tolls_amount', 'extra']]
inv_data.describe()
inv_data = inv_data[inv_data['fare_amount']>0]
min(inv_data['speed'])
# fare per unit speed: a screening ratio for nonsensical records
inv_data['ratio_fare_speed'] = inv_data['fare_amount']/inv_data['speed']
inv_data['ratio_fare_speed'].describe()
inv_data[inv_data['ratio_fare_speed']==max(inv_data['ratio_fare_speed'])]
# As you can see, this ratio can capture data that makes no sense, e.g. a 600-mile trip recorded as taking only 0.04 seconds.
# Trim the ratio to its central 95% (2.5th to 97.5th percentiles).
inv_data = inv_data[inv_data['ratio_fare_speed']<inv_data['ratio_fare_speed'].quantile(q=0.975)]
inv_data = inv_data[inv_data['ratio_fare_speed']>inv_data['ratio_fare_speed'].quantile(q=0.025)]
inv_data.isnull().sum()
plt.figure(figsize=(12, 8))
plt.plot(inv_data['speed'], inv_data['fare_amount'])
plt.show()
# Clearly the data contain some errors: for example, the max speed is 63272 miles per hour, which does not make any sense.
# Inspect extreme toll amounts and negative surcharges.
taxi[taxi['tolls_amount']==max(taxi['tolls_amount'])]
zone[zone['LocationID'].isin([90, 11])]
inv_data[inv_data['tolls_amount']>100]
inv_data[inv_data['extra']<0]
def second_preprocessing(month, year, filename):
    """Second-layer cleaning of a .feather trip file (overwrites it).

    Removes fare/speed-ratio outliers, implausible speeds, flat-rate
    airport trips with inconsistent fares, huge tolls and negative
    surcharges. `month` and `year` keep the interface symmetric with the
    other preprocessing layers but are not used here.
    """
    # read the data
    data = pd.read_feather(filename)
    data = data.drop(labels=['index'], axis=1)
    # NOTE(review): comment originally said "bigger than 2.5" but the
    # filter is > 0; the 2.5 floor was already applied in layer one.
    data = data.loc[data['fare_amount']>0]
    # keep the central 95% of the fare/speed ratio
    data['ratio_fare_speed'] = data['fare_amount']/data['speed']
    data = data[data['ratio_fare_speed']<data['ratio_fare_speed'].quantile(q=0.975)]
    data = data[data['ratio_fare_speed']>data['ratio_fare_speed'].quantile(q=0.025)]
    del data['ratio_fare_speed']
    # normal speed (max=65)
    data = data.loc[data['speed']<65]
    # keep only JFK flat-rate trips with the correct $52 fare
    data = data.loc[jkf_rate(data), :]
    # keep only Newark trips meeting the $17.5 minimum fare
    data = data.loc[newark_rate(data), :]
    # huge tolls amount will be removed
    data = data.loc[data['tolls_amount']<100, :]
    # remove extra that is less than 0
    data = data[data['extra']>=0]
    data.reset_index().to_feather(filename)
    del data
def jkf_rate(data):
    """Return the index labels of rows to KEEP under the JFK flat-rate
    rule: drop RatecodeID==2 rows whose fare is not exactly $52.

    FIX: the previous set-difference implementation returned the labels
    in arbitrary set order; a boolean mask keeps them in the frame's
    original row order, making downstream .loc selection deterministic.
    """
    bad = (data['RatecodeID'] == 2) & (data['fare_amount'] != 52)
    return list(data.index[~bad])
def newark_rate(data):
    """Return the index labels of rows to KEEP under the Newark rule:
    drop RatecodeID==3 rows with fare below the $17.5 minimum.

    FIX: the previous set-difference implementation returned the labels
    in arbitrary set order; a boolean mask keeps them in the frame's
    original row order, making downstream .loc selection deterministic.
    """
    bad = (data['RatecodeID'] == 3) & (data['fare_amount'] < 17.5)
    return list(data.index[~bad])
# Second layer of preprocessing: run over each period's feather file.
#dates = [['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
dates = [['12', '2016']]
for month, year in dates:
    print('Month:', month)
    print('Year:', year)
    second_preprocessing(month, year, f'yellow_tripdata_{year}-{month}.feather')
def whisk_outliers(att, df):
    """Return [lower, upper] Tukey whisker bounds (1.5 * IQR fences)
    for column `att` of `df`."""
    series = df[att]
    q1 = series.quantile(q=0.25)
    q3 = series.quantile(q=0.75)
    fence = 1.5 * (q3 - q1)
    return [q1 - fence, q3 + fence]
taxi = pd.read_feather('yellow_tripdata_2018-12.feather')
taxi.drop(labels=['index', 'total_amount', 'payment_type'], axis=1, inplace = True)
taxi.info()
del taxi
# NOTE(review): `taxi` was deleted above and `fare_dist_ratio` only
# exists inside third_preprocessing -- these cells presumably ran in a
# different order in the original notebook.
taxi[taxi['fare_dist_ratio'] > 200]
taxi[taxi['fare_dist_ratio']>200]
# Index 22928 may indicate a cancelled trip from the airport, but it is still recorded.
#taxi['tpep_pickup_datetime'][0].time().hour
# day-of-week name for each pickup timestamp
taxi['tpep_pickup_datetime'].dt.day_name()
taxi.columns
def add_trip_features(data):
    """Drop fare/distance-ratio outliers (ratio >= 200, threshold chosen
    by inspection) and add `hour` and `day` (weekday name) columns
    derived from the pickup timestamp.

    Expects `tpep_pickup_datetime` to already be a datetime column.
    Returns the filtered frame; the input frame is not mutated.
    """
    data = data.copy()
    data['fare_dist_ratio'] = data['fare_amount'] / data['trip_distance']
    data = data.loc[data['fare_dist_ratio'] < 200, :]
    # vectorized .dt accessors instead of per-row lambdas
    data['hour'] = data['tpep_pickup_datetime'].dt.hour
    data['day'] = data['tpep_pickup_datetime'].dt.day_name()
    return data.drop(labels=['fare_dist_ratio'], axis=1)


def third_preprocessing(month, year, filename):
    """Third-layer preprocessing: drop unused columns, add hour/day
    features via add_trip_features, and overwrite `filename`.

    FIX: now returns the processed frame -- the original returned None,
    which broke `taxi = third_preprocessing(...)` call sites. Callers
    that ignore the return value are unaffected.
    """
    data = pd.read_feather(filename)
    data.drop(labels=['index', 'payment_type', 'total_amount'], inplace = True, axis=1)
    data = add_trip_features(data)
    data.reset_index().to_feather(filename)
    return data
# NOTE(review): as originally written, third_preprocessing returned
# None, so `taxi` here would be None and taxi.info() would raise.
taxi = third_preprocessing('01', '2017', 'yellow_tripdata_2017-01.feather')
taxi.info()
# third layer of preprocessing over every season period
# (the original comment said "second layer"; this runs the third layer)
dates = [['12', '2016'],['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
for each_dates in dates:
    print('Month:', each_dates[0])
    print('Year:', each_dates[1])
    third_preprocessing(each_dates[0], each_dates[1], f'yellow_tripdata_{each_dates[1]}-{each_dates[0]}.feather')
def fourth_preprocessing(month, year):
    """
    This preprocessing is mainly creating multiple dataframe for attribute investigation

    Reads yellow_tripdata_{year}-{month}.feather and writes three kinds of
    aggregate feather files for the period:
      * pu-do-date-{month}-{year}-{freq,mean}.feather --
        per (PULocationID, DOLocationID, Date) trip counts / means
      * hour-day-{month}-{year}-{freq,mean}.feather --
        per (PULocationID, DOLocationID, day, hour) counts / means
      * rate-id-{month}-{year}-mean.feather --
        per (RatecodeID, PULocationID, day, hour) means
    """
    # PU-DOlocation
    data = pd.read_feather(f'yellow_tripdata_{year}-{month}.feather')
    data.drop(labels=['index'], inplace = True, axis=1)
    # one count per trip so groupby().sum() yields trip frequencies
    data['trip_count'] = 1
    date_pu_do_freq = data.groupby(by=['PULocationID', 'DOLocationID', 'Date', 'RatecodeID']).sum().reset_index()
    date_pu_do_freq = date_pu_do_freq.loc[:, ['PULocationID', 'DOLocationID', 'Date', 'RatecodeID', 'trip_count',
                                              'trip_distance']]
    date_pu_do_mean = data.groupby(by=['PULocationID', 'DOLocationID', 'Date']).mean().reset_index()
    date_pu_do_mean.drop(labels=['RatecodeID', 'hour', 'trip_count'], axis=1, inplace = True)
    date_pu_do_freq.reset_index().to_feather(f'pu-do-date-{month}-{year}-freq.feather')
    date_pu_do_mean.reset_index().to_feather(f'pu-do-date-{month}-{year}-mean.feather')
    # free the large intermediates before the next aggregation
    del date_pu_do_freq
    del date_pu_do_mean
    # hour day
    hour_day_pu_do_freq = data.groupby(by=['PULocationID', 'DOLocationID', 'day', 'hour']).sum().reset_index()
    hour_day_pu_do_mean = data.groupby(by=['PULocationID', 'DOLocationID', 'day', 'hour']).mean().reset_index()
    hour_day_pu_do_freq = hour_day_pu_do_freq.loc[:, ['PULocationID', 'DOLocationID', 'day', 'hour', 'trip_count']]
    hour_day_pu_do_mean.drop(labels=['trip_count', 'RatecodeID'], axis=1, inplace = True)
    hour_day_pu_do_freq.reset_index().to_feather(f'hour-day-{month}-{year}-freq.feather')
    hour_day_pu_do_mean.reset_index().to_feather(f'hour-day-{month}-{year}-mean.feather')
    del hour_day_pu_do_freq
    del hour_day_pu_do_mean
    # RatecodeID
    # NOTE(review): named *_freq but computed with .mean() and written to
    # a -mean file -- the name is misleading; verify intent.
    rate_id_freq = data.groupby(by=['RatecodeID', 'PULocationID', 'day', 'hour']).mean().reset_index()
    rate_id_freq.drop(labels=['trip_count'], axis=1, inplace = True)
    rate_id_freq.reset_index().to_feather(f'rate-id-{month}-{year}-mean.feather')
    del rate_id_freq
    del data
# Fourth layer: build the aggregated attribute tables for every period.
dates = [['12', '2016'],['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
for month, year in dates:
    print('Month:', month)
    print('Year:', year)
    fourth_preprocessing(month, year)
# Concatenate the per-period aggregate files (Dec-2016 seed plus the
# later periods) into one feather file per aggregate kind.
# BUG FIX: 'pu-do-date-freq.feather' was missing from this list even
# though it is read later, so the merged frequency file was never built.
file_name = ['pu-do-date-mean.feather', 'pu-do-date-freq.feather',
             'hour-day-freq.feather', 'hour-day-mean.feather',
             'rate-id-mean.feather']
dates = [['01', '2017'], ['02', '2017'], ['12', '2017'], ['01', '2018'],
         ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]


def merge_period_files(prefix, metric):
    """Stack the per-(month, year) feather files sharing `prefix` and
    `metric` ('mean' or 'freq') into one DataFrame, seeded with the
    Dec 2016 period."""
    data = pd.read_feather(f"{prefix}-12-2016-{metric}.feather")
    data.drop(labels=['index'], axis=1, inplace=True)
    for month, year in dates:
        open_data = pd.read_feather(f"{prefix}-{month}-{year}-{metric}.feather")
        open_data.drop(labels=['index'], axis=1, inplace=True)
        data = pd.concat([data, open_data])
    return data


for each_file in file_name:
    # the metric name ('mean'/'freq') is embedded in the file name
    if each_file.startswith('pu-do'):
        merged = merge_period_files('pu-do-date', each_file[11:15])
    elif each_file.startswith('hour-day'):
        merged = merge_period_files('hour-day', each_file[9:13])
    else:
        merged = merge_period_files('rate-id', 'mean')
    merged.reset_index().to_feather(each_file)
    del merged
# Load the merged aggregate tables and the supporting datasets.
taxi = pd.read_feather("pu-do-date-mean.feather")
pu_do_freq = pd.read_feather('pu-do-date-freq.feather')
pu_do_mean = pd.read_feather('pu-do-date-mean.feather')
rate_id = pd.read_feather('rate-id-mean.feather')
hour_day_freq = pd.read_feather('hour-day-freq.feather')
hour_day_mean = pd.read_feather('hour-day-mean.feather')
# external dataset
snow = pd.read_csv('snow.csv')
# drop the leading index columns written by the earlier to_csv calls
snow = snow[snow.columns[2:]]
# shapefile and taxi zone
sf = gpd.read_file("taxi_file/taxi_zones.shp")
zone = pd.read_csv('taxi_file/taxi+_zone_lookup.csv')
# Map the lookup table's first column (LocationID) to its second column
# (the borough name). dict(zip(...)) replaces the row-by-row iterrows
# loop and avoids positional row[i] indexing, which is deprecated for
# iterrows rows in recent pandas.
zone_dic = dict(zip(zone.iloc[:, 0], zone.iloc[:, 1]))
# helper function (re-declared for this analysis section)
def whisk_outliers(att, df):
    """Tukey boxplot whiskers for df[att]: [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
    q1 = df[att].quantile(q=0.25)
    q3 = df[att].quantile(q=0.75)
    half_window = 1.5 * (q3 - q1)
    return [q1 - half_window, q3 + half_window]
# conduct minor test by assuming both has normal distribution with unknown mean and std
# test hypothesis mean difference = 0
# this code was taken from https://pythonfordatascienceorg.wordpress.com/welch-t-test-python-pandas/
# and modified to match the purpose of this assignment
def welch_ttest(x, y, equal_var=False):
    """Welch's unequal-variance t-test for a difference in means.

    Prints the statistic, p-value and Welch-Satterthwaite degrees of
    freedom and (improvement, backward compatible) also returns them as
    a (t, p, dof) tuple so callers can use the numbers programmatically.

    NOTE: the dof formula uses x.var()/y.var(); pandas Series use ddof=1
    (sample variance) while NumPy arrays default to ddof=0 -- pass one
    kind consistently.
    """
    ## Welch-Satterthwaite Degrees of Freedom ##
    dof = (x.var()/x.size + y.var()/y.size)**2 / (
        (x.var()/x.size)**2 / (x.size-1) + (y.var()/y.size)**2 / (y.size-1))
    t, p = stats.ttest_ind(x, y, equal_var=equal_var)
    print("\n",
          f"Welch's t-test= {t:.4f}", "\n",
          f"p-value = {p:.8f}", "\n",
          f"Welch-Satterthwaite Degrees of Freedom= {dof:.4f}")
    return t, p, dof
# Daily medians of the mean table, joined with the snow data, plus the
# daily total trip counts from the frequency table.
grouped_date = pu_do_mean.groupby(by='Date').median().reset_index()
grouped_date = pd.merge(grouped_date, snow, left_on='Date', right_on='Date')
grouped_date_freq = pu_do_freq.groupby(by='Date').sum().reset_index()
# NOTE(review): assigning by position assumes both groupbys produce the
# same Date order -- confirm the tables cover identical dates.
grouped_date['trip_count'] = grouped_date_freq['trip_count']
plt.figure(figsize=(10, 6))
# this code is taken from mast30034 tutorial
sns.heatmap(grouped_date[['trip_distance', 'trip_count', 'MaxTemp', 'MinTemp',
                          'Average', 'New Snow', 'Snow Depth']].corr())
#plt.savefig('plot/correlation_plot.png')
plt.show()
grouped_date[['trip_distance', 'trip_count', 'MaxTemp', 'MinTemp',
              'Average', 'New Snow', 'Snow Depth']].corr()
# Time-series plot of daily trip frequency across the three winters.
pu_do_freq.drop(labels=['index'], axis=1, inplace=True)
grouped_date = pu_do_freq.groupby(by='Date').sum().reset_index()
grouped_date = pd.merge(grouped_date, snow, left_on='Date', right_on='Date')
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['trip_count'],"b-")
# vertical separators between the three ~90-day winter windows
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 270, 30)]
ticks.append(269)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the trip frequency of New York Yellow Taxi\n during December-February from 2016-2019\n', fontsize=18)
ax.set_ylabel('Trip frequency')
ax.set_xlabel('Date')
# NOTE(review): the annotation coordinates and counts below are
# hard-coded from a specific run of the full dataset
ax.scatter(x=[24, 114, 204], y=[153998, 136866, 113671], c='r', label='Christmas')
plt.legend()
plt.annotate('391434 trips\n 2016-12-16\n Min temp: -7 Celcius', xy=(15, 391434), xytext=(50, 375000),
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('113671 trips\n 2018-12-25\n Min temp: -1 Celcius', xy=(204, 113671), xytext=(220, 113671),
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('Bomb Cyclone\n114150 trips\n2018-01-04\nSnow fall rate: 9 inch', xy=(124, 114150), xytext=(130, 135000),
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('NA Blizzard\n192959 trips\n2017-02-09\nSnow fall rate: 10.3 inch', xy=(70, 192959), xytext=(35, 147500),
             arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('plot/time_series_freq.png')
plt.show()
# Create a figure instance
fig = plt.figure(1, figsize=(9, 6))
# Create an axes instance
ax = fig.add_subplot(111)
# Create the boxplot: one box per 90-day winter window
bp = ax.boxplot([grouped_date.loc[0:90, 'trip_count'],grouped_date.loc[90:180, 'trip_count'],
                 grouped_date.loc[180:, 'trip_count']])
ax.set_xticklabels(['2016-2017', '2017-2018', '2018-2019'])
plt.ylabel('Trip count')
plt.title('Boxplot of trip frequency during different year period in December-February\n', fontsize=18)
# Save the figure
#fig.savefig('plot/boxplot_freq.png', bbox_inches='tight')
plt.show()
# summary statistics for each 90-day window
for i in range(0, 270, 90):
    print(f'Statistical inference of each period:')
    print(grouped_date.loc[i:i+90, 'trip_count'].describe())
    print('\n')
# Collect the row labels of trip-count outliers (Tukey fences) within
# each 90-day winter window.
# FIX: removed the unused `index_period` variable.
index_outliers = []
for i in range(0, 270, 90):
    sliced_df = grouped_date.loc[i:i+90, :]
    lb, ub = whisk_outliers('trip_count', sliced_df)
    # record outliers above the upper and below the lower fence
    for outliers in (sliced_df[sliced_df['trip_count'] > ub],
                     sliced_df[sliced_df['trip_count'] < lb]):
        if len(outliers.index) > 0:
            index_outliers.append(list(outliers.index))
# flatten the per-window lists into one list of row labels
index_outliers = [j for i in index_outliers for j in i]
grouped_date.loc[index_outliers, :]
#find the big drop
find_perc = grouped_date.copy()
# day-over-day percentage change of total trip count
find_perc['perc_change'] = find_perc['trip_count'].pct_change()
find_perc.dropna(inplace = True)
# five largest single-day drops
find_perc[find_perc['perc_change']<0].sort_values(by='perc_change', ascending=True)[:5]
# lag-1 relationship: today's count vs tomorrow's count
try_predict = find_perc['trip_count'][1:]
perc_change = find_perc['trip_count'][:-1]
plt.scatter(try_predict, perc_change)
plt.ylabel("Next day frequency")
plt.xlabel("Previous frequency")
plt.title("Scatter plot of previous trip frequency vs next day trip frequency\n", fontsize=14)
plt.show()
np.corrcoef(try_predict, perc_change)
#find which day has the most decrease in trip demand
# FIX: .copy() so adding columns to the filtered slices does not trigger
# pandas SettingWithCopyWarning / silently-lost writes.
neg_dem = find_perc[find_perc['perc_change']<0].copy()
neg_dem['Date'] = pd.to_datetime(neg_dem['Date'])
neg_dem['day'] = neg_dem['Date'].apply(lambda x: x.day_name())
pos_dem = find_perc[find_perc['perc_change']>0].copy()
pos_dem['Date'] = pd.to_datetime(pos_dem['Date'])
pos_dem['day'] = pos_dem['Date'].apply(lambda x: x.day_name())
# share of decrease/increase days by weekday
mode_day_neg = pd.DataFrame(neg_dem['day'].value_counts())
mode_day_neg['day'] = round((mode_day_neg['day']/mode_day_neg['day'].sum()), 2)
mode_day_neg = mode_day_neg.loc[['Monday', 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], :]
mode_day_pos = pd.DataFrame(pos_dem['day'].value_counts())
mode_day_pos['day'] = round((mode_day_pos['day']/mode_day_pos['day'].sum()), 2)
# NOTE(review): Sunday is absent from this list -- presumably no Sunday
# showed an increase in the data; confirm before reuse.
mode_day_pos = mode_day_pos.loc[['Monday', 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday"], :]
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
fig.suptitle('Tally on percentage increase/decrease of trip frequency by day\n', fontsize=18)
ax[0].plot(mode_day_neg, color='r', label='Decrease')
ax[1].plot(mode_day_pos, color='g', label='Increase')
# FIX: do not shadow the `ax` array with the loop variable
for axis in ax.flat:
    axis.set(ylabel='Percentage (%)')
    axis.legend()
#fig.savefig('plot/tally_percentage_trip_daily.png')
plt.show()
# check the quantile (Q-Q) plot for a normality check
sm.qqplot(grouped_date.loc[0:90, 'trip_count'], line ='45')
py.show()
# Shapiro-Wilk normality tests on each window before the Welch t-test
print('Test whether the mean difference of trip frequency\nbetween 2016-2017 and 2018-2019:\n')
print('Normal distribution test for 2016-2017:')
print(stats.shapiro(grouped_date.loc[0:90, 'trip_count']))
print('\nNormal distribution test for 2018-2019:')
print(stats.shapiro(grouped_date.loc[180:, 'trip_count'])) #game day may have big p-value, but still stick to normal for simplicity
#welch_ttest(grouped_date.loc[0:90, 'trip_count'], grouped_date.loc[180:, 'trip_count'])
# Time-series plot of the minimum temperature forecast.
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['MinTemp'],"b-")
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
# NOTE(review): ticks run to 239 here but 269 in the frequency plot --
# confirm the intended x-range.
ticks = [i for i in range(0, 239, 30)]
ticks.append(238)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the minimum temperature forecast of New York December-Februrary 2016-2019\n', fontsize=18)
ax.set_ylabel('Temperature (Celcius)')
ax.set_xlabel('Date')
plt.legend()
# annotation values are hard-coded from a specific full-data run
plt.annotate('-18 degree celcius\n 2018-01-07\n Trip: 222426', xy=(127, -18), xytext=(140, -18),
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('8 degree celcius\n 2017-12-05\n Trip: 308487', xy=(94, 8), xytext=(110, 5),
             arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('time_series_mintemp.png')
plt.show()
# Time-series plot of the snow-depth forecast.
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['Snow Depth'],"b-")
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 239, 30)]
ticks.append(238)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the snow depth forecast of New York December-Februrary 2016-2019\n', fontsize=18)
ax.set_ylabel('Depth (inch)')
ax.set_xlabel('Date')
plt.legend()
# annotation values are hard-coded from a specific full-data run
plt.annotate('9.0 inch snow\n 2017-02-10\n Trip: 318808', xy=(71, 9.0), xytext=(20, 7),
             arrowprops=dict(facecolor='green', shrink=0.05))
#plt.annotate('8 degree celcius\n 2017-12-05\n Trip: 308487', xy=(94, 8), xytext=(110, 5),
#arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('time_series_snowdepth.png')
plt.show()
# Scatter + regression of trip frequency against minimum temperature.
plt.figure(figsize=(12, 8))
correlation = np.corrcoef(grouped_date['MinTemp'], grouped_date['trip_count'])[0][-1]
ax = sns.regplot(x='MinTemp', y='trip_count', data=grouped_date, color="b",
                 label=f'Correlation: {round(correlation, 2)}')
plt.title('Scatter plot of trip frequency vs minimum forecast temperature\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.xlabel('Minimum temperature (Celcius)')
plt.legend()
#plt.savefig('freq_mintemp_scatter_normal.png')
plt.show()
# trip count statistic inference for temperature 0
grouped_date[grouped_date['MinTemp']==0].describe()['trip_count']
def label_snow_fall(x):
    """Bin a snow-fall amount (inches) into an ordinal category:
    1: x < 2, 2: 2 <= x < 4, 3: 4 <= x < 6, 4: x >= 6."""
    for bin_id, upper in enumerate((2, 4, 6), start=1):
        if x < upper:
            return bin_id
    return 4
# Bin the daily snowfall and summarise trip counts per bin.
grouped_date['SnowFallBins'] = grouped_date['New Snow'].apply(lambda x: label_snow_fall(x))
grouped_bins_snow_fall = grouped_date.groupby(by='SnowFallBins').mean().reset_index()
stats_analysis = pd.DataFrame(grouped_date[grouped_date['SnowFallBins']==1].describe()['trip_count'])
stats_analysis.columns = ['0-2 inch']
stats_analysis['2-4 inch'] = grouped_date[grouped_date['SnowFallBins']==2].describe()['trip_count']
stats_analysis['4-6 inch'] = grouped_date[grouped_date['SnowFallBins']==3].describe()['trip_count']
stats_analysis['>=6 inch'] = grouped_date[grouped_date['SnowFallBins']==4].describe()['trip_count']
stats_analysis
# Bar plot of average trip frequency per snow-fall bin.
plt.figure(figsize=(12, 8))
labels=["0-2","2-4", "4-6", ">=6"]
ax = sns.barplot(x='SnowFallBins', y='trip_count', data=grouped_bins_snow_fall)
plt.title('Bar plot of trip frequency grouped by four range of snow fall rate\n', fontsize=18)
plt.ylabel('Trip frequency')
#plt.xtickslabels(labels)
plt.xlabel('inch')
ax.set_xticklabels(labels)
# NOTE(review): unlike the other figures this savefig is NOT commented
# out; it raises if the 'plot/' directory does not exist.
plt.savefig('plot/bar_average_trip_snow_fall.png')
plt.show()
# This code was inspired by https://www.analyticsvidhya.com/blog/2020/06/guide-geospatial-analysis-folium-python/
# Build the per-day pickup-zone colouring for the Christmas 2018
# time-slider map.
sliced_christmas = pu_do_freq[pu_do_freq['Date'].isin(['2018-12-24','2018-12-25','2018-12-26'])]
sliced_christmas = sliced_christmas.groupby(by=['Date','PULocationID']).sum().reset_index()
sliced_christmas['PULocationID'] = sliced_christmas['PULocationID'].astype('str')
# separate the trip counts into 10 equal-width bins
bins=np.linspace(min(sliced_christmas['trip_count']),max(sliced_christmas['trip_count']),11)
# colour each location by its trip-count bin (light -> dark red)
sliced_christmas['color']=pd.cut(sliced_christmas['trip_count'],
    bins,labels=['#FFEBEB','#F8D2D4','#F2B9BE','#EBA1A8','#E58892','#DE6F7C',
                 '#D85766','#D13E50','#CB253A','#C50D24'],include_lowest=True)
sliced_christmas = sliced_christmas[['Date','PULocationID','color']]
# add a neutral colour for every zone with no pickups on that day
# BUG FIX: the original assigned the concat result to `grouped_game`
# (a leftover name that was never used again), so the filler rows were
# silently dropped; assign back to sliced_christmas instead.
for date in sliced_christmas['Date'].unique():
    diff=set(zone['LocationID'].astype(str))-set(sliced_christmas[sliced_christmas['Date']==date]['PULocationID'])
    for i in diff:
        sliced_christmas = pd.concat(
            [sliced_christmas,
             pd.DataFrame([[date,i,'#0073CF']],columns=['Date','PULocationID','color'])],
            ignore_index=True)
sliced_christmas.sort_values('Date',inplace=True)
# TimeSliderChoropleth keys frames by epoch-second strings
sliced_christmas['Date'] = pd.to_datetime(sliced_christmas['Date'])
sliced_christmas['Date']=(sliced_christmas['Date'].astype(int)// 10**9).astype('U10')
# style dict: {location_id: {timestamp: {'color': ..., 'opacity': ...}}}
trip_dict={}
for i in sliced_christmas['PULocationID'].unique():
    trip_dict[i]={}
    for j in sliced_christmas[sliced_christmas['PULocationID']==i].set_index(['PULocationID']).values:
        trip_dict[i][j[0]]={'color':j[1],'opacity':0.7}
# this code was recycled and edited from MAST30034 tutorial
# sf stands for shape file
# reproject the zone polygons to lon/lat (WGS84) for folium
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf['LocationID'] = sf['LocationID'].astype('str')
geoJSON = sf[['LocationID','geometry']].set_index('LocationID').to_json()
# plot the slider map
fig6=Figure(height=850,width=1000)
m6 = folium.Map([40.66, -73.94], tiles='cartodbpositron', zoom_start=10)
fig6.add_child(m6)
g = TimeSliderChoropleth(
    geoJSON,
    styledict=trip_dict,
).add_to(m6)
# landmark markers: Manhattan plus the three area airports
folium.Marker(location=[40.778805,-73.9703917],
              tooltip='<strong>Manhattan</strong>',
              icon=folium.Icon(color='red',prefix='fa', icon='star')).add_to(m6)
folium.Marker(location=[40.6413111,-73.7803278],
              tooltip='<strong>JFK NY Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m6)
folium.Marker(location=[40.6895314,-74.1766511],
              tooltip='<strong>Newark NJ Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m6)
folium.Marker(location=[40.7769271,-73.8761546],
              tooltip='<strong>LaGuardia Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m6)
#m6.save('plot/last_christmas_timeslider.html')
m6
# Map zone ids to borough names for both ends of each trip.
pu_do_freq['PUBorough'] = pu_do_freq['PULocationID'].apply(lambda x: zone_dic[x])
pu_do_freq['DOBorough'] = pu_do_freq['DOLocationID'].apply(lambda x: zone_dic[x])
sliced_christmas = pu_do_freq[pu_do_freq['Date'].isin(['2018-12-24','2018-12-25','2018-12-26'])]
sliced_christmas_date = sliced_christmas.groupby(by=['Date']).sum().reset_index()
# check the median decrease in each day
sliced_christmas
# check the borough
sliced_christmas = pu_do_freq.groupby(by=['PUBorough', 'Date']).sum().reset_index()
sliced_christmas.loc[sliced_christmas['Date'].isin(['2018-12-24','2018-12-25','2018-12-26'])]
# check languardia: 138 and jfk: 132
# NOTE(review): after the PUBorough groupby above, PULocationID is a
# summed numeric column, so isin([132, 138]) no longer selects those
# zones -- this cell presumably targeted the pre-groupby frame.
id_airports = [132, 138]
sliced_christmas[sliced_christmas['PULocationID'].isin(id_airports)]
# Choropleth of the areas most impacted by the blizzards.
# Compare per-zone daily pickups on the two blizzard days against the
# per-zone average of all other days.
disaster = ['2018-01-04', '2017-02-09']
set_no_blizzard = set(pu_do_freq.index)-set(pu_do_freq[pu_do_freq['Date'].isin(disaster)].index)
normal_trend = pu_do_freq.loc[set_no_blizzard, :]
blizzard_trend = pu_do_freq[pu_do_freq['Date'].isin(disaster)]
# daily totals per zone, then the average day per zone
normal_trend = normal_trend.groupby(by=['PULocationID', 'Date']).sum().reset_index()
blizzard_trend = blizzard_trend.groupby(by=['PULocationID', 'Date']).sum().reset_index()
normal_trend = normal_trend.groupby(by=['PULocationID']).mean().reset_index()
blizzard_trend = blizzard_trend.groupby(by=['PULocationID']).mean().reset_index()
normal_trend['trip_count'] = normal_trend['trip_count'].astype('int')
blizzard_trend['trip_count'] = blizzard_trend['trip_count'].astype('int')
# keep only zones present in both frames
# BUG FIX: `i in normal_trend['PULocationID']` tests membership in the
# Series INDEX, not its values; build a set of the values instead.
valid_ids = set(normal_trend['PULocationID'])
id_lst = [i for i in blizzard_trend['PULocationID'] if i in valid_ids]
normal_trend = normal_trend[normal_trend['PULocationID'].isin(id_lst)]
blizzard_trend = blizzard_trend[blizzard_trend['PULocationID'].isin(id_lst)]
normal_trend.set_index('PULocationID', inplace = True)
blizzard_trend.set_index('PULocationID', inplace = True)
# ratio of blizzard-day pickups to a normal day's pickups, per zone
normal_trend['diff'] = blizzard_trend['trip_count']/normal_trend['trip_count']
normal_trend.reset_index(inplace = True)
#normal_trend['diff'] = np.sqrt(normal_trend['diff'])
normal_trend['PULocationID'] = normal_trend['PULocationID'].astype('str')
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(normal_trend, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()
# Choropleth of the blizzard/normal trip-count ratio per pickup zone.
m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=normal_trend,
    columns=['PULocationID', 'diff'],
    key_on='feature.properties.LocationID',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    zoomOut=False,
    legend_name='Daily decrease/increase trip frequency'
).add_to(m)
folium.LayerControl().add_to(m)
# markers with hard-coded trip counts from a specific full-data run
folium.Marker(location=[40.7769271,-73.8761546],
              tooltip='<strong>LaGuardia Airport: 1038 trips</strong>',
              icon=folium.Icon(color='red',prefix='fa',icon='plane')).add_to(m)
folium.Marker(location=[40.682334,-73.8826597],
              tooltip='<strong>Cypress Hills: 22 trips</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
folium.Marker(location=[40.760338,-73.9536737],
              tooltip='<strong>Rosevelt Island: 17 trips</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
folium.Marker(location=[40.830321,-73.9254947],
              tooltip='<strong>West Concourse: 41 trips</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
#m.save('plot/blizzard_map.html')
m
# Top 20 zones by blizzard/normal trip ratio.
normal_trend.sort_values(by='diff', ascending=False)[:20]
# percentage decrease in each borough
normal_trend['PULocationID'] = normal_trend['PULocationID'].astype(int)
# Map each zone id to its borough (zone_dic is defined in an earlier cell).
normal_trend['PUBorough'] = normal_trend['PULocationID'].apply(lambda x: zone_dic[x])
borough_groupby = normal_trend.groupby(by='PUBorough').median()
# 1 - ratio = fractional drop in trips per borough during the blizzard.
1-borough_groupby['diff']
# 91% taxi trips are from manhattan
trip_borough = pu_do_freq.groupby(by='PUBorough').sum()['trip_count']
trip_borough/trip_borough.sum()
# most increase
zone[zone['LocationID'].isin([63, 202, 247])]
# Roosevelt Island
print(normal_trend[normal_trend['PULocationID']==202])
print(blizzard_trend.loc[202])
# laguardia
blizzard_trend.loc[[138]]
# laguardia
normal_trend.loc[normal_trend['PULocationID']==138]
hour_day_freq.head()
# Attach the daily snowfall columns to the hourly frequency table.
# NOTE(review): the same merge appears again in a later cell; if both run,
# the snow columns come back with _x/_y suffixes and later lookups of
# 'New Snow' break -- confirm only one of the merges is kept.
hour_day_freq = pd.merge(hour_day_freq, snow, left_on='Date', right_on='Date')
# Total trips per (date, weekday), then the average daily total per weekday.
daily = hour_day_freq.groupby(by=['Date', 'day']).sum()
daily_mean = daily.groupby(by=['day']).mean()
daily_mean = daily_mean[['trip_count']]
# Reorder rows Monday..Sunday for presentation.
daily_mean = daily_mean.loc[['Monday', 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], :]
# this is a tutorial code on how to use violin plots, taken from https://app.mode.com/modeanalytics/reports/76bb957b935c/details/notebook
sns.set(style="whitegrid")
daily.reset_index(inplace = True)
f, ax = plt.subplots(figsize=(7, 7))
# Show each distribution with both violins and points
sns.violinplot(x="day",y="trip_count",data=daily,
               inner="box", palette="Set3", cut=2, linewidth=3)
sns.despine(left=True)
f.suptitle('Average daily trip frequency', fontsize=18, fontweight='bold')
ax.set_ylabel("Trip frequency",size = 16,alpha=0.7)
# Summary statistics of the daily trip counts for each weekday.
# IMPROVED: built with one comprehension instead of seven near-identical
# assignments; dict insertion order keeps the columns in Monday..Sunday order,
# and Series.describe() on the trip_count column is equivalent to the original
# frame-level describe()['trip_count'].
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
stats_analysis = pd.DataFrame({
    day: daily.loc[daily['day'] == day, 'trip_count'].describe()
    for day in weekday_order
})
stats_analysis
# Overlay the average hourly trip-count profile of every weekday on one chart.
plt.figure(figsize=(10, 6))
weekday_colors = [('Monday', 'b'), ('Tuesday', 'g'), ("Wednesday", 'r'), ("Thursday", 'c'),
                  ("Friday", 'm'), ("Saturday", 'y'), ("Sunday", 'k')]
for day_name, color in weekday_colors:
    # Per-date hourly totals, then the mean hourly total across dates.
    hourly_profile = (hour_day_freq[hour_day_freq['day'] == day_name]
                      .groupby(by=['Date', 'hour']).sum()
                      .groupby(by=['hour']).mean())[['trip_count']]
    plt.plot(hourly_profile, label=day_name, c=color)
plt.title('Average hourly trip frequency in each day\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.legend()
plt.savefig('plot/trip_freq_daily_hourly_trend.png')
plt.show()
# NOTE(review): hour_day_freq may already have been merged with snow in an
# earlier cell; re-merging duplicates the snow columns with _x/_y suffixes and
# would make the 'New Snow' lookups below fail -- confirm this runs only once.
hour_day_freq = pd.merge(hour_day_freq, snow, left_on='Date', right_on='Date')
hour_day_freq.head()
# Split the data into days with fresh snowfall vs days without.
snow_day = hour_day_freq[hour_day_freq['New Snow']>0]
no_snow_day = hour_day_freq[hour_day_freq['New Snow']==0]
# Per-date hourly totals, then the mean hourly profile across dates.
snow_day = snow_day.groupby(by=['Date', 'hour']).sum()
snow_day = snow_day.groupby(by=['hour']).mean()
no_snow_day = no_snow_day.groupby(by=['Date', 'hour']).sum()
no_snow_day = no_snow_day.groupby(by=['hour']).mean()
snow_day = snow_day[['trip_count']]
no_snow_day = no_snow_day[['trip_count']]
plt.figure(figsize=(10, 6))
plt.plot(snow_day, label='Snow day', c='b')
plt.plot(no_snow_day, label='No Snow day', c='r')
plt.title('Average Trip frequency hourly trend during snow and no snow day\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.legend()
#plt.savefig('plot/trip_freq_snow_nosnow_trend.png')
plt.show()
# split the hour into 2, 8am-7pm, 8pm-7am
# Compare day-time (08-19h) vs night-time (20-07h) demand per pickup zone.
non_working_hour = [20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7]
working_hour = list(range(8, 20))
working_hour = hour_day_freq[hour_day_freq['hour'].isin(working_hour)]
non_working_hour = hour_day_freq[hour_day_freq['hour'].isin(non_working_hour)]
# Daily totals per zone, then the mean daily total per zone.
working_hour = working_hour.groupby(by=['PULocationID', 'Date']).sum().reset_index()
working_hour = working_hour.groupby(by=['PULocationID']).mean().reset_index()
non_working_hour = non_working_hour.groupby(by=['PULocationID', 'Date']).sum().reset_index()
non_working_hour = non_working_hour.groupby(by=['PULocationID']).mean().reset_index()
working_hour['trip_count'] = working_hour['trip_count'].astype('int')
non_working_hour['trip_count'] = non_working_hour['trip_count'].astype('int')
# BUG FIX: the original computed 'diff' by subtracting the two frames
# positionally, which silently pairs the wrong zones whenever a PULocationID
# appears in only one of the two frames.  Join explicitly on the zone id so
# rows always line up.
paired = pd.merge(working_hour[['PULocationID', 'trip_count']],
                  non_working_hour[['PULocationID', 'trip_count']],
                  on='PULocationID', suffixes=('_work', '_off'))
diff_df = pd.DataFrame([])
diff_df['PULocationID'] = paired['PULocationID'].astype('str')
# Log-ratio: positive -> busier during working hours, negative -> busier at night.
diff_df['diff'] = np.log(paired['trip_count_work'])-np.log(paired['trip_count_off'])
#grouped_do_citi['trip_count'] = np.log(grouped_do_citi['trip_count'])
# Re-project the taxi-zone shapefile to WGS84 lat/lon as folium expects.
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(diff_df, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()
m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
# Shade each zone by the day/night log-ratio computed above.
folium.Choropleth(
geo_data=geoJSON,
name='choropleth',
data=diff_df,
columns=['PULocationID', 'diff'],
key_on='feature.properties.LocationID',
fill_color='YlGnBu',
fill_opacity=0.7,
line_opacity=0.2,
zoomOut=False,
legend_name='Daily decrease/increase trip frequency'
).add_to(m)
folium.LayerControl().add_to(m)
# Markers highlight zones discussed in the write-up.
folium.Marker(location=[40.6413111,-73.7803278],
tooltip='<strong>JFK NY Airport: 1103 trips</strong>',
icon=folium.Icon(color='orange',prefix='fa',icon='plane')).add_to(m)
folium.Marker(location=[40.7769271,-73.8761546],
tooltip='<strong>LaGuardia Airport: 1038 trips</strong>',
icon=folium.Icon(color='red',prefix='fa',icon='plane')).add_to(m)
folium.Marker(location=[40.697316,-73.9200917],
tooltip='<strong>Bushwick, Greenpoint, Williamsburg</strong>',
icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
folium.Marker(location=[40.721834,-73.9933407],
tooltip='<strong>Lower East Side</strong>',
icon=folium.Icon(color='purple',prefix='fa',icon='info')).add_to(m)
#m.save('plot/blizzard_map.html')
m
# Compare between holiday, blizzard and normal day
# Compare the average hourly demand profile of holidays, blizzard days and
# ordinary days.
holiday_date = ['2016-12-25', '2017-12-25', '2018-12-25',
                '2016-12-31', '2017-12-31', '2018-12-31',
                '2016-02-14', '2017-02-14', '2018-02-14']
disaster = ['2018-01-04', '2017-02-09']
# Every "special" date: holidays plus the two blizzard days (same contents and
# order as the original hand-written list).
appended_date = holiday_date + disaster
holiday_trend = hour_day_freq[hour_day_freq['Date'].isin(holiday_date)]
# FIX: the original selected normal days with hour_day_freq.loc[set(...)];
# indexing with a set is rejected by modern pandas and produced arbitrary row
# order.  A boolean mask selects exactly the same rows (row order is
# irrelevant to the groupby means below).
normal_trend = hour_day_freq[~hour_day_freq['Date'].isin(appended_date)]
blizzard_trend = hour_day_freq[hour_day_freq['Date'].isin(disaster)]

def _hourly_profile(df):
    # Total trips per (date, hour), then the mean hourly total across dates.
    per_date = df.groupby(by=['Date', 'hour']).sum()
    return per_date.groupby(by=['hour']).mean()[['trip_count']]

holiday_trend = _hourly_profile(holiday_trend)
normal_trend = _hourly_profile(normal_trend)
blizzard_trend = _hourly_profile(blizzard_trend)
plt.figure(figsize=(10, 6))
plt.plot(normal_trend, label='Normal', c='b')
plt.plot(holiday_trend, label='Holiday', c='r')
plt.plot(blizzard_trend, label='Blizzard', c='g')
plt.title('Average Trip frequency hourly trend during holiday, normal and blizzard\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.legend()
#plt.savefig('plot/trip_freq_diff_day.png')
plt.show()
# Early-morning (hours 4-8 inclusive) comparison: blizzard vs holiday.
stats_df = pd.DataFrame(blizzard_trend.loc[4:8].describe())
stats_df.columns = ['Blizzard_4-9am']
stats_df['Holiday_4-9am'] = holiday_trend.loc[4:8].describe()
stats_df
# Daily median trip distance across the three winters, joined with snowfall.
grouped_date = pu_do_mean.groupby(by='Date').median().reset_index()
grouped_date = pd.merge(grouped_date, snow, left_on='Date', right_on='Date')
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): this adds a second axes covering the figure and shadows the
# subplot axes created on the previous line -- presumably intentional for
# layout; confirm.
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['trip_distance'],"b-")
# Dashed verticals mark the boundaries between the three winter periods;
# positions 90/180 assume one row per day (~90 days per winter) -- TODO confirm.
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 270, 30)]
ticks.append(269)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the median trip distance of New York Yellow Taxi\n during December-February from 2016-2019\n', fontsize=18)
ax.set_ylabel('Trip distance (miles)')
ax.set_xlabel('Date')
# Highlight days with deep snow pack / heavy fresh snowfall (> 5 inches).
ax.scatter(x=grouped_date[grouped_date['Snow Depth']>5].index,
y=grouped_date[grouped_date['Snow Depth']>5]['trip_distance'], c='r', label='Thick Snow')
ax.scatter(x=grouped_date[grouped_date['New Snow']>5].index,
y=grouped_date[grouped_date['New Snow']>5]['trip_distance'], c='g', label='Heavy Snow Fall')
plt.legend()
# Hand-placed annotations for the two blizzard events.
plt.annotate('Bomb Cyclone\n4.59 miles\n2018-01-04\nSnow fall rate: 9 inch', xy=(124, 4.585), xytext=(135, 4.585),
arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('NA Blizzard\n5.08 miles\n2017-02-09\nSnow fall rate: 10.3 inch', xy=(70, 5.075556), xytext=(35, 4.9),
arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('plot/time_series_dist.png')
plt.show()
# Show the heavy-snowfall days behind the annotations above.
grouped_date[grouped_date['New Snow']>5]
# Create a figure instance
fig = plt.figure(1, figsize=(9, 6))
# Create an axes instance
ax = fig.add_subplot(111)
# Create the boxplot
# One box per winter period.  NOTE(review): .loc slices are end-inclusive, so
# row 90 lands in both the first and second period and row 180 in both the
# second and third -- confirm whether that double counting is intended.
bp = ax.boxplot([grouped_date.loc[0:90, 'trip_distance'],grouped_date.loc[90:180, 'trip_distance'],
grouped_date.loc[180:, 'trip_distance']])
ax.set_xticklabels(['2016-2017', '2017-2018', '2018-2019'])
plt.ylabel('Trip distance (miles)')
plt.title('Boxplot of average total daily trip distance during\ndifferent year period in December-February\n', fontsize=18)
# Save the figure
#fig.savefig('plot/boxplot_dist.png', bbox_inches='tight')
plt.show()
# central tendency here!!!
# Descriptive statistics for each ~90-day winter period, then a Welch t-test
# (via the project-defined welch_ttest helper) on the first vs last winter.
date = [0, 90, 180]
for period_start in date:
    print('Descriptive analysis of each year period')
    print(grouped_date.loc[period_start:period_start + 90, 'trip_distance'].describe())
    print("\n")
print('Test whether the mean difference of average trip distance\nbetween 2016-2017 and 2018-2019:\n')
# Shapiro-Wilk normality checks before assuming a t-test is appropriate.
print('Normal distribution test for 2016-2017:')
print(stats.shapiro(grouped_date.loc[0:90, 'trip_distance']))
print('\nNormal distribution test for 2018-2019:')
print(stats.shapiro(grouped_date.loc[180:, 'trip_distance'])) #game day may have big p-value, but still stick to normal for simplicity
welch_ttest(grouped_date.loc[0:90, 'trip_distance'], grouped_date.loc[180:, 'trip_distance'])
# check outliers
# Collect the row indices of trip-distance outliers in each 90-day period,
# using the Tukey whisker bounds from the project-defined whisk_outliers().
# IMPROVED: dropped the unused `index_period` local and replaced the
# append-then-flatten pattern with list.extend (same indices, same order).
index_outliers = []
for i in range(0, 270, 90):
    # NOTE(review): .loc[i:i+90] is end-inclusive, so boundary rows fall in
    # two consecutive periods (kept as in the original) -- confirm intent.
    sliced_df = grouped_date.loc[i:i+90, :]
    lb, ub = whisk_outliers('trip_distance', sliced_df)
    # Anything outside [lb, ub] is flagged; extending with an empty index is a
    # no-op, so no length check is needed.
    index_outliers.extend(sliced_df[sliced_df['trip_distance'] > ub].index)
    index_outliers.extend(sliced_df[sliced_df['trip_distance'] < lb].index)
grouped_date.loc[index_outliers, :]
# Daily trip totals, augmented with the daily median distance and snowfall.
grouped_date_freq = pu_do_freq.groupby(by=['Date']).sum().reset_index()
# NOTE(review): this pairs rows by index position, not by Date -- correct only
# if grouped_date (built earlier) has exactly the same Date order; verify, or
# merge on 'Date' instead.
grouped_date_freq['trip_distance_mean'] = grouped_date['trip_distance']
grouped_date_freq = pd.merge(grouped_date_freq, snow, left_on='Date', right_on='Date')
# Label days by snow on the ground ('Snow Depth' truthy, i.e. non-zero).
grouped_date_freq['yes_snow'] = grouped_date_freq['Snow Depth'].apply(lambda x: 'Snow' if x else 'No snow')
# remove outliers by inspection
grouped_date_freq = grouped_date_freq[grouped_date_freq['trip_distance']>5]
grouped_date_freq = grouped_date_freq[grouped_date_freq['trip_count']>133251]
# correlation between trip frequency and trip distance during snow
np.corrcoef(grouped_date_freq[grouped_date_freq['yes_snow']=='Snow']['trip_count'],
grouped_date_freq[grouped_date_freq['yes_snow']=='Snow']['trip_distance_mean'])
# correlation between trip frequency and trip distance during no snow
np.corrcoef(grouped_date_freq[grouped_date_freq['yes_snow']=='No snow']['trip_count'],
grouped_date_freq[grouped_date_freq['yes_snow']=='No snow']['trip_distance_mean'])
plt.figure(figsize=(12, 8))
# Log-transform the large daily counts before regressing on distance.
grouped_date_freq['trip_count_log'] = np.log(grouped_date_freq['trip_count'])
correlation = np.corrcoef(grouped_date_freq['trip_count_log'], grouped_date_freq['trip_distance_mean'])[0][-1]
# NOTE(review): lmplot creates its own figure, so the plt.figure above is
# unused; `correlation` is only consumed by the commented-out regplot call.
ax = sns.lmplot(x="trip_distance_mean", y="trip_count_log",
data=grouped_date_freq, fit_reg=True, hue='yes_snow', legend=True)
#sns.regplot(x=np.log(grouped_date_freq['trip_count']), y='trip_distance_mean', data=grouped_date_freq, color="b",
#label=f'Correlation: {round(correlation, 2)}')
plt.title('Scatter plot of log transformed trip frequency \nvs average total daily trip distance\n', fontsize=14)
plt.xlabel('Trip distance (miles)')
plt.ylabel('Log transformed trip frequency')
plt.legend()
#plt.savefig('plot/dist_freq_snow.png')
plt.show()
# Attach snowfall info to the per-(zone, date) distance table.
pu_do_mean = pd.merge(pu_do_mean, snow, left_on='Date', right_on='Date')
# Deep-snow days (>= 5 inches on the ground) vs all other days.
snow_trip_sliced = pu_do_mean[pu_do_mean['Snow Depth']>=5]
no_snow_trip_sliced = pu_do_mean[pu_do_mean['Snow Depth']<5]
# Median trip distance per pickup zone under each condition.
snow_trip = snow_trip_sliced.groupby(by=['PULocationID']).median().reset_index()
no_snow_trip = no_snow_trip_sliced.groupby(by=['PULocationID']).median().reset_index()
snow_trip = snow_trip[['PULocationID','trip_distance']]
no_snow_trip = no_snow_trip[['PULocationID','trip_distance']]
# After this merge, trip_distance_x = deep-snow median, trip_distance_y = other days.
snow_trip = pd.merge(snow_trip, no_snow_trip, left_on='PULocationID', right_on='PULocationID')
# Positive 'diff' -> longer trips from this zone when the snow is deep.
snow_trip['diff'] = snow_trip['trip_distance_x']-snow_trip['trip_distance_y']
#grouped_do_citi['trip_count'] = np.log(grouped_do_citi['trip_count'])
# String ids for the join with the shapefile below.
snow_trip['PULocationID'] = snow_trip['PULocationID'].astype('str')
# Re-project the taxi-zone shapefile to WGS84 lat/lon for folium.
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(snow_trip, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()
m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
folium.Choropleth(
geo_data=geoJSON,
name='choropleth',
data=snow_trip,
columns=['PULocationID', 'diff'],
key_on='feature.properties.LocationID',
fill_color='YlGnBu',
fill_opacity=0.7,
line_opacity=0.2,
zoomOut=False,
legend_name='Decrease/Increase average distance travelled'
).add_to(m)
folium.LayerControl().add_to(m)
#m.save('plot/snow_distance_map.html')
m
# Borough-level average of the snow/no-snow distance difference.
snow_trip['PUBorough'] = snow_trip['PULocationID'].astype(int).apply(lambda x: zone_dic[x])
snow_trip.groupby(by='PUBorough').mean()
# check jfk and laguardia
snow_trip[snow_trip['PULocationID'].isin(['132', '138'])]
# bronx park
snow_trip_sliced[snow_trip_sliced['PULocationID']==31]
# Check negotiated fare
#check_mean_distance['trip_count'] = 1
# Daily trip counts per rate code, excluding the standard rate (RatecodeID 1)
# so the rarer codes (JFK, Newark, negotiated, ...) remain visible.
grouped_by_date = pu_do_freq.groupby(by=['Date', 'RatecodeID']).sum().reset_index()[['Date', 'RatecodeID', 'trip_count']]
grouped_by_date = grouped_by_date[grouped_by_date['RatecodeID']!=1]
# FIX: .loc slicing is end-inclusive, so the original boundaries (456 and 911)
# put the boundary rows into two periods at once.  Also take explicit copies so
# the astype assignments below don't operate on a view and trigger
# SettingWithCopyWarning.
first_per = grouped_by_date.loc[:455].copy()
first_per['RatecodeID'] = first_per['RatecodeID'].astype('str')
second_per = grouped_by_date.loc[456:910].copy()
second_per['RatecodeID'] = second_per['RatecodeID'].astype('str')
third_per = grouped_by_date.loc[911:].copy()
third_per['RatecodeID'] = third_per['RatecodeID'].astype('str')
plt.figure(figsize=(8, 6))
# Distribution of (log) daily negotiated-fare trip counts, one curve per winter.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; use
# sns.histplot(..., kde=True) or sns.kdeplot when upgrading.
sns.distplot(np.log(first_per[first_per['RatecodeID']=='5']['trip_count']), label='2016-2017')
sns.distplot(np.log(second_per[second_per['RatecodeID']=='5']['trip_count']), label='2017-2018')
sns.distplot(np.log(third_per[third_per['RatecodeID']=='5']['trip_count']), label='2018-2019')
plt.title('Distribution plot of negotiated fare trip frequency\n', fontsize=18)
plt.legend()
#plt.savefig('plot/dist_nego.png')
plt.show()
# Total trips per rate code within each winter period.
first_per = first_per.groupby(by='RatecodeID').sum()
second_per = second_per.groupby(by='RatecodeID').sum()
third_per = third_per.groupby(by='RatecodeID').sum()
plt.figure(figsize=(8, 6))
plt.plot(first_per, c='r', label='2016-2017')
plt.plot(second_per, c='b', label='2017-2018')
plt.plot(third_per, c='g', label='2018-2019')
plt.legend()
plt.title('Rate code trip increase/decrease in each period\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.xlabel('Rate code id')
#plt.savefig('plot/rate_trend.png')
plt.show()
# rate code id 5 is the negotiated fare
# Growth factor per rate code from the first winter to the last.
third_per/first_per
# Median trip stats per rate code (sanity check on distances by code).
check_distance = rate_id.groupby(by=['RatecodeID']).median()
check_distance
# Negotiated-fare (rate code 5) pickups during the 2018-2019 winter only.
pu_rate_5 = pu_do_freq[pu_do_freq['RatecodeID']==5]
pu_rate_5 = pu_rate_5.loc[pu_rate_5['Date'] >= '2018-12-01']
# Total negotiated-fare trips per pickup zone, busiest first.
pu_rate_5 = (pu_rate_5.groupby(by=['PULocationID'])
             .sum()
             .reset_index()[['PULocationID', 'trip_count']]
             .sort_values(by='trip_count', ascending=False))
# Show the ten busiest negotiated-fare pickup zones.
pu_rate_5[:10]
# Names/boroughs of the ten busiest negotiated-fare pickup zones.
zone[zone['LocationID'].isin(pu_rate_5[:10]['PULocationID'])]
# String ids for the join with the shapefile below.
pu_rate_5['PULocationID'] = pu_rate_5['PULocationID'].astype('str')
#grouped_do_citi['trip_count'] = np.log(grouped_do_citi['trip_count'])
# Re-project the taxi-zone shapefile to WGS84 lat/lon for folium.
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(pu_rate_5, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()
m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
# Shade each zone by its negotiated-fare trip count.
folium.Choropleth(
geo_data=geoJSON,
name='choropleth',
data=pu_rate_5,
columns=['PULocationID', 'trip_count'],
key_on='feature.properties.LocationID',
fill_color='YlGnBu',
fill_opacity=0.7,
line_opacity=0.2,
zoomOut=False,
legend_name='Trip frequency'
).add_to(m)
folium.LayerControl().add_to(m)
folium.Marker(location=[40.6413111,-73.7803278],
tooltip='<strong>JFK NY Airport</strong>',
icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m)
#m.save('plot/rate_code_five.html')
m